In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# Apply seaborn's 'whitegrid' theme to the plotting defaults used by this notebook.
sns.set_theme(style='whitegrid')
In [2]:
# Load the model input table.
#
# The project root differs per machine (personal: "~/Nextcloud/linkedin_recruiter",
# Windows: "N:/johnson/linkedin_recruiter"). Instead of commenting lines in and
# out, set the LINKEDIN_RECRUITER_DIR environment variable to override the root;
# when it is unset (or empty) the personal Nextcloud location is used.
PROJECT_DIR = Path(
    os.environ.get("LINKEDIN_RECRUITER_DIR") or "~/Nextcloud/linkedin_recruiter"
).expanduser()
df = pd.read_csv(PROJECT_DIR / "inputs" / "model_input_2021-02-09.csv")
In [3]:
# Columns that get a base-10 log counterpart named 'log<column>'.
log_cols = [
    'flow', 'distance',
    'users_orig', 'users_dest',
    'pop_orig', 'pop_dest',
    'maxgdp_orig', 'maxgdp_dest',
    'area_orig', 'area_dest',
]
# Build every log10 column in one step; existing 'log*' columns are overwritten,
# so re-running the cell gives the same frame.
df = df.assign(**{f'log{name}': np.log10(df[name]) for name in log_cols})
In [4]:
# One-hot encode the origin and destination ISO3 codes. The resulting indicator
# columns are named 'orig_<code>' / 'dest_<code>' and are appended to a copy of
# df (joined on the index) as `wdf`.
wide_vars = pd.get_dummies(
    df[['iso3_orig', 'iso3_dest']],
    prefix={'iso3_orig': 'orig', 'iso3_dest': 'dest'},
)
wdf = df.join(wide_vars)
In [10]:
# linear correlations
numeric_cols = [
    'logflow','logdistance', 'logusers_orig', 'logusers_dest', 'prop_users_orig', 'prop_users_dest',
    'logpop_orig', 'logpop_dest', 'logarea_orig', 'logarea_dest', 'logmaxgdp_orig',
    'logmaxgdp_dest', 'internet_orig', 'internet_dest'
]
# Compute the pairwise correlations (pandas' default: Pearson) once and reuse them
# for both the mask shape and the heatmap.
corr = df[numeric_cols].corr()
# Hide the upper triangle and the diagonal. The mask is built from positions
# rather than from the correlation values, so a coefficient that happens to be
# exactly 0 in the upper triangle is still hidden.
matrix = np.triu(np.ones(corr.shape, dtype=bool))
fig, ax = plt.subplots(figsize=(11.7, 8.27))
ax.set_title('Pairwise linear correlations')
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, mask=matrix, cmap='coolwarm', ax=ax)
# plt.savefig("/Users/scharlottej13/Nextcloud/linkedin_recruiter/outputs/correlation_matrix.png")
Out[10]:
<AxesSubplot:>
In [6]:
# Scatter of logflow against prop_users_orig, one facet per query_date
# (wrapped at 4 panels per row). Marker size follows pop_orig and the hover
# tooltip shows the origin/destination country.
rows_by_date = df.sort_values(by='query_date')
fig = px.scatter(
    rows_by_date,
    x='prop_users_orig',
    y='logflow',
    size='pop_orig',
    facet_col='query_date',
    facet_col_wrap=4,
    hover_data=['country_orig', 'country_dest'],
)
fig.show()
In [7]:
# Scatter of logflow against prop_users_dest, one facet per query_date
# (wrapped at 4 panels per row); hover shows the origin/destination country.
# NOTE(review): x uses the destination column but marker size still uses
# pop_orig -- confirm this is intended and not meant to be pop_dest.
rows_by_date = df.sort_values(by='query_date')
fig = px.scatter(
    rows_by_date,
    x='prop_users_dest',
    y='logflow',
    size='pop_orig',
    facet_col='query_date',
    facet_col_wrap=4,
    hover_data=['country_orig', 'country_dest'],
)
fig.show()
In [8]:
# Scatter of logflow against logusers_dest for the 2020-07-25 query date only,
# colored by the 'orig_usa' indicator column of wdf.
# NOTE(review): 'orig_usa' presumably comes from the one-hot encoding of
# iso3_orig, which requires the ISO3 values to be lowercase -- verify.
single_date = wdf.sort_values(by='query_date').query("query_date == '2020-07-25'")
fig = px.scatter(
    single_date,
    x='logusers_dest',
    y='logflow',
    color='orig_usa',
    hover_data=['country_orig', 'country_dest', 'flow'],
)
fig.show()
In [9]:
# Pairwise relationships between the variables below, colored by query_date
# with the 'crest' palette; diagonal plots are drawn unfilled.
pair_vars = [
    "logflow", "logdistance",
    "logusers_orig", "logusers_dest",
    "logpop_dest", "logpop_orig",
    "prop_users_orig", "prop_users_dest",
]
sns.pairplot(
    df.sort_values(by='query_date'),
    vars=pair_vars,
    hue='query_date',
    palette='crest',
    diag_kws={'fill': False},
)
# plt.savefig("/Users/scharlottej13/Nextcloud/linkedin_recruiter/outputs/correlation_matrix.png")
Out[9]:
<seaborn.axisgrid.PairGrid at 0x7fcb12311e50>
In [ ]: